---
title: "Bill dataset comparison"
author: "Miles Quarterman"
date: "2023-10-06"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

## R Markdown

## All bills
```{r}
library(tidyverse)
library("writexl")
# Load the congressional bills file and keep rows after 2002 that carry a
# positive major-topic code. subset() also drops rows where either column is
# NA. `all_bills` is reused by the CQ chunk further down.
all_bills <- read.csv("US-Legislative-congressional_bills_19.3_3_2 (1).csv")
all_bills <- subset(all_bills, year > 2002 & majortopic > 0)

# One row per major-topic code (Var1) with its number of bills (Freq).
all_bills_topics <- as.data.frame(table(all_bills$majortopic))

# Percentage share per topic. The denominator is the observed total rather
# than a typed-in row count, so the shares sum to 100 even if the input file
# or the filters above change.
all_bills_topics$percent_freq <-
  100 * all_bills_topics$Freq / sum(all_bills_topics$Freq)

# Short display labels, assigned by position: this relies on table() listing
# the codes in ascending order and on the data holding exactly this many
# distinct codes.
all_bills_labels <- c(
  "Econ", "Civ Rights", "Health", "Ag", "Labor", "Edu", "Env", "Energy",
  "Immigr", "Trans", "Crime", "SW", "Housing", "Dom Comm", "Def", "Tech",
  "Frgn Trade", "Int Aff", "Govt Ops", "Publ Lands", "Priv"
)
stopifnot(nrow(all_bills_topics) == length(all_bills_labels))
all_bills_topics$Topics <- all_bills_labels

# table() stores the codes as factor levels. Convert them to real numbers so
# the exported Var1 column holds topic codes (not factor positions) and the
# placeholder row below can carry code 23.
all_bills_topics$Var1 <- as.numeric(as.character(all_bills_topics$Var1))

# Zero-count placeholder so the Culture topic still shows up on the x axis.
all_bills_topics <- rbind(
  all_bills_topics,
  data.frame(Var1 = 23, Freq = 0, percent_freq = 0, Topics = "Culture")
)

ggplot(all_bills_topics, aes(x = Topics, y = percent_freq)) +
  geom_col() +
  labs(
    title = "Topic Distribution of All Bills",
    x = "Topics",
    y = "Frequency (%)",
    caption = "(Priv topic is unique to All Bills)"
  )
write_xlsx(all_bills_topics, "all_bills_topics.xlsx")

# Sanity check: should print 100.
sum(all_bills_topics$percent_freq)
```
## Hearings
```{r}
# Load the congressional hearings file and keep rows after 2002
# (subset() also drops rows whose year is NA).
hearings <- read.csv("US-Legislative_congressional_hearings-21.4.csv")
hearings <- subset(hearings, year > 2002)

# One row per major-topic code (Var1) with its number of hearings (Freq).
hearings_topics <- as.data.frame(table(hearings$majortopic))

# Percentage share per topic. Dividing by the observed total instead of a
# typed-in row count keeps the shares summing to 100 if the data changes.
hearings_topics$percent_freq <-
  100 * hearings_topics$Freq / sum(hearings_topics$Freq)

# Short display labels, assigned by position: this relies on table() listing
# the codes in ascending order and on the data holding exactly this many
# distinct codes.
hearings_labels <- c(
  "Econ", "Civ Rights", "Health", "Ag", "Labor", "Edu", "Env", "Energy",
  "Immigr", "Trans", "Crime", "SW", "Housing", "Dom Comm", "Def", "Tech",
  "Frgn Trade", "Int Aff", "Govt Ops", "Publ Lands", "Culture"
)
stopifnot(nrow(hearings_topics) == length(hearings_labels))
hearings_topics$Topics <- hearings_labels

ggplot(hearings_topics, aes(x = Topics, y = percent_freq)) +
  geom_col() +
  labs(
    title = "Topic Distribution of Hearings",
    x = "Topics",
    y = "Frequency (%)"
  )
write_xlsx(hearings_topics, "hearings_topics.xlsx")

# Sanity check: should print 100.
sum(hearings_topics$percent_freq)
```
## Public laws
```{r}
# Load the public laws file and keep rows after 2002
# (subset() also drops rows whose year is NA).
public_laws <- read.csv("US-Legislative-public_laws_20.1_7.csv")
public_laws <- subset(public_laws, year > 2002)

# One row per major-topic code (Var1) with its number of laws (Freq).
public_laws_topics <- as.data.frame(table(public_laws$majortopic))

# Percentage share per topic. Dividing by the observed total instead of a
# typed-in row count keeps the shares summing to 100 if the data changes.
public_laws_topics$percent_freq <-
  100 * public_laws_topics$Freq / sum(public_laws_topics$Freq)

# Short display labels, assigned by position: this relies on table() listing
# the codes in ascending order and on the data holding exactly this many
# distinct codes.
public_laws_labels <- c(
  "Econ", "Civ Rights", "Health", "Ag", "Labor", "Edu", "Env", "Energy",
  "Immigr", "Trans", "Crime", "SW", "Housing", "Dom Comm", "Def", "Tech",
  "Frgn Trade", "Int Aff", "Govt Ops", "Publ Lands", "Culture"
)
stopifnot(nrow(public_laws_topics) == length(public_laws_labels))
public_laws_topics$Topics <- public_laws_labels

ggplot(public_laws_topics, aes(x = Topics, y = percent_freq)) +
  geom_col() +
  labs(
    title = "Topic Distribution of Public Laws",
    x = "Topics",
    y = "Frequency (%)"
  )
write_xlsx(public_laws_topics, "public_laws_topics.xlsx")

# Sanity check: should print 100.
sum(public_laws_topics$percent_freq)
```
## Roll call votes
```{r}
# Load the roll call vote file and keep rows after 2002 that carry a positive
# major-topic code. subset() also drops rows where either column is NA.
roll_call <- read.csv("rcv_voteview_v3_1 (2).csv")
roll_call <- subset(roll_call, year > 2002 & majortopic > 0)

# One row per major-topic code (Var1) with its number of votes (Freq).
roll_call_topics <- as.data.frame(table(roll_call$majortopic))

# Percentage share per topic. Dividing by the observed total instead of a
# typed-in row count keeps the shares summing to 100 if the data changes.
roll_call_topics$percent_freq <-
  100 * roll_call_topics$Freq / sum(roll_call_topics$Freq)

# Short display labels, assigned by position: this relies on table() listing
# the codes in ascending order and on the data holding exactly this many
# distinct codes.
roll_call_labels <- c(
  "Econ", "Civ Rights", "Health", "Ag", "Labor", "Edu", "Env", "Energy",
  "Immigr", "Trans", "Crime", "SW", "Housing", "Dom Comm", "Def", "Tech",
  "Frgn Trade", "Int Aff", "Govt Ops", "Publ Lands", "Culture"
)
stopifnot(nrow(roll_call_topics) == length(roll_call_labels))
roll_call_topics$Topics <- roll_call_labels

ggplot(roll_call_topics, aes(x = Topics, y = percent_freq)) +
  geom_col() +
  labs(
    title = "Topic Distribution of Roll Call Votes",
    x = "Topics",
    y = "Frequency (%)"
  )
write_xlsx(roll_call_topics, "roll_call_topics.xlsx")

# Sanity check: should print 100.
sum(roll_call_topics$percent_freq)
```
## Executive orders
```{r}
# Load the executive orders file and keep rows after 2002
# (subset() also drops rows whose year is NA).
executive_orders <- read.csv("US-Executive-executive_orders_21.2.csv")
executive_orders <- subset(executive_orders, year > 2002)

# One row per major-topic code (Var1) with its number of orders (Freq).
executive_orders_topics <- as.data.frame(table(executive_orders$majortopic))

# Percentage share per topic. Dividing by the observed total instead of a
# typed-in row count keeps the shares summing to 100 if the data changes.
executive_orders_topics$percent_freq <-
  100 * executive_orders_topics$Freq / sum(executive_orders_topics$Freq)

# Short display labels, assigned by position: this relies on table() listing
# the codes in ascending order and on the data holding exactly this many
# distinct codes.
executive_orders_labels <- c(
  "Econ", "Civ Rights", "Health", "Ag", "Labor", "Edu", "Env", "Energy",
  "Immigr", "Trans", "Crime", "SW", "Housing", "Dom Comm", "Def", "Tech",
  "Frgn Trade", "Int Aff", "Govt Ops", "Publ Lands"
)
stopifnot(nrow(executive_orders_topics) == length(executive_orders_labels))
executive_orders_topics$Topics <- executive_orders_labels

# table() stores the codes as factor levels. Convert them to real numbers so
# the exported Var1 column holds topic codes (not factor positions) and the
# placeholder row below can carry code 23.
executive_orders_topics$Var1 <-
  as.numeric(as.character(executive_orders_topics$Var1))

# Zero-count placeholder so the Culture topic still shows up on the x axis.
executive_orders_topics <- rbind(
  executive_orders_topics,
  data.frame(Var1 = 23, Freq = 0, percent_freq = 0, Topics = "Culture")
)

ggplot(executive_orders_topics, aes(x = Topics, y = percent_freq)) +
  geom_col() +
  labs(
    title = "Topic Distribution of Executive Orders",
    x = "Topics",
    y = "Frequency (%)"
  )
write_xlsx(executive_orders_topics, "executive_orders_topics.xlsx")

# Sanity check: should print 100.
sum(executive_orders_topics$percent_freq)
```
## State of the union speeches
```{r}
# Load the State of the Union file and keep rows after 2002 that carry a
# positive major-topic code. subset() also drops rows where either column is
# NA.
sotu_speeches <- read.csv("US-Exec_SOTU_2023.csv")
sotu_speeches <- subset(sotu_speeches, year > 2002 & majortopic > 0)

# One row per major-topic code (Var1) with its number of rows (Freq).
sotu_speeches_topics <- as.data.frame(table(sotu_speeches$majortopic))

# Percentage share per topic. Dividing by the observed total instead of a
# typed-in row count keeps the shares summing to 100 if the data changes.
sotu_speeches_topics$percent_freq <-
  100 * sotu_speeches_topics$Freq / sum(sotu_speeches_topics$Freq)

# Short display labels, assigned by position: this relies on table() listing
# the codes in ascending order and on the data holding exactly this many
# distinct codes.
sotu_speeches_labels <- c(
  "Econ", "Civ Rights", "Health", "Ag", "Labor", "Edu", "Env", "Energy",
  "Immigr", "Trans", "Crime", "SW", "Housing", "Dom Comm", "Def", "Tech",
  "Frgn Trade", "Int Aff", "Govt Ops", "Publ Lands"
)
stopifnot(nrow(sotu_speeches_topics) == length(sotu_speeches_labels))
sotu_speeches_topics$Topics <- sotu_speeches_labels

# table() stores the codes as factor levels. Convert them to real numbers so
# the exported Var1 column holds topic codes (not factor positions) and the
# placeholder row below can carry code 23.
sotu_speeches_topics$Var1 <-
  as.numeric(as.character(sotu_speeches_topics$Var1))

# Zero-count placeholder so the Culture topic still shows up on the x axis.
sotu_speeches_topics <- rbind(
  sotu_speeches_topics,
  data.frame(Var1 = 23, Freq = 0, percent_freq = 0, Topics = "Culture")
)

ggplot(sotu_speeches_topics, aes(x = Topics, y = percent_freq)) +
  geom_col() +
  labs(
    title = "Topic Distribution of State of the Union Speeches",
    x = "Topics",
    y = "Frequency (%)"
  )
write_xlsx(sotu_speeches_topics, "sotu_speeches_topics.xlsx")

# Sanity check: should print 100.
sum(sotu_speeches_topics$percent_freq)
```
## Democratic party platform
```{r}
# Load the Democratic platform file and keep rows after 2002 that carry a
# positive major-topic code. subset() also drops rows where either column is
# NA.
dem_platform <- read.csv("Dem_partyplatform_20.1.csv")
dem_platform <- subset(dem_platform, year > 2002 & majortopic > 0)

# One row per major-topic code (Var1) with its number of rows (Freq).
dem_platform_topics <- as.data.frame(table(dem_platform$majortopic))

# Percentage share per topic. Dividing by the observed total instead of a
# typed-in row count keeps the shares summing to 100 if the data changes.
dem_platform_topics$percent_freq <-
  100 * dem_platform_topics$Freq / sum(dem_platform_topics$Freq)

# Short display labels, assigned by position: this relies on table() listing
# the codes in ascending order and on the data holding exactly this many
# distinct codes.
dem_platform_labels <- c(
  "Econ", "Civ Rights", "Health", "Ag", "Labor", "Edu", "Env", "Energy",
  "Immigr", "Trans", "Crime", "SW", "Housing", "Dom Comm", "Def", "Tech",
  "Frgn Trade", "Int Aff", "Govt Ops", "Publ Lands", "Culture", "WX", "Rel"
)
stopifnot(nrow(dem_platform_topics) == length(dem_platform_labels))
dem_platform_topics$Topics <- dem_platform_labels

ggplot(dem_platform_topics, aes(x = Topics, y = percent_freq)) +
  geom_col() +
  labs(
    title = "Topic Distribution of the Democratic Party Platform",
    x = "Topics",
    y = "Frequency (%)",
    caption = "(Rel and WX topics are unique to Party Platforms)"
  )
write_xlsx(dem_platform_topics, "dem_platform_topics.xlsx")

# Sanity check: should print 100.
sum(dem_platform_topics$percent_freq)
```
## Republican party platform
```{r}
# Load the Republican platform file and keep rows after 2002 that carry a
# positive major-topic code. subset() also drops rows where either column is
# NA.
rep_platform <- read.csv("US-Party-Rep_Platform_19.2..csv")
rep_platform <- subset(rep_platform, year > 2002 & majortopic > 0)

# One row per major-topic code (Var1) with its number of rows (Freq).
rep_platform_topics <- as.data.frame(table(rep_platform$majortopic))

# Percentage share per topic. Dividing by the observed total instead of a
# typed-in row count keeps the shares summing to 100 if the data changes.
rep_platform_topics$percent_freq <-
  100 * rep_platform_topics$Freq / sum(rep_platform_topics$Freq)

# Short display labels, assigned by position: this relies on table() listing
# the codes in ascending order and on the data holding exactly this many
# distinct codes.
rep_platform_labels <- c(
  "Econ", "Civ Rights", "Health", "Ag", "Labor", "Edu", "Env", "Energy",
  "Immigr", "Trans", "Crime", "SW", "Housing", "Dom Comm", "Def", "Tech",
  "Frgn Trade", "Int Aff", "Govt Ops", "Publ Lands"
)
stopifnot(nrow(rep_platform_topics) == length(rep_platform_labels))
rep_platform_topics$Topics <- rep_platform_labels

# table() stores the codes as factor levels. Convert them to real numbers so
# the exported Var1 column holds topic codes (not factor positions) and the
# placeholder rows below can carry their own codes.
rep_platform_topics$Var1 <-
  as.numeric(as.character(rep_platform_topics$Var1))

# Zero-count placeholders so these three topics still show up on the x axis,
# matching the categories plotted for the Democratic platform.
rep_platform_topics <- rbind(
  rep_platform_topics,
  data.frame(
    Var1 = c(23, 26, 31),
    Freq = 0,
    percent_freq = 0,
    Topics = c("Culture", "WX", "Rel")
  )
)

ggplot(rep_platform_topics, aes(x = Topics, y = percent_freq)) +
  geom_col() +
  labs(
    title = "Topic Distribution of the Republican Party Platform",
    x = "Topics",
    y = "Frequency (%)",
    caption = "(Rel and WX topics are unique to Party Platforms)"
  )
write_xlsx(rep_platform_topics, "rep_platform_topics.xlsx")

# Sanity check: should print 100.
sum(rep_platform_topics$percent_freq)
```
## Gallup most important problem
```{r}
# Load the Gallup "most important problem" file and keep rows after 2002
# (subset() also drops rows whose year is NA).
gallup <- read.csv("US-Public-Gallups_Most_Important_Problem-21.2.csv")
gallup <- subset(gallup, year > 2002)

# One row per major-topic code (Var1) with its number of rows (Freq).
gallup_topics <- as.data.frame(table(gallup$majortopic))

# Overall mean of the percent column, printed for reference.
mean(gallup$percent)

# Mean of `percent` within each topic, scaled by 100. tapply() groups by the
# same codes table() counted, and the result is looked up by code, so every
# mean lands on its own row. This holds for any code value, and a topic whose
# mean is NA stays visible as NA instead of shifting the rows after it.
gallup_topic_means <- tapply(gallup$percent, gallup$majortopic, mean)
gallup_topics$percent_freq <-
  100 * as.numeric(gallup_topic_means[as.character(gallup_topics$Var1)])

# Short display labels, assigned by position: this relies on table() listing
# the codes in ascending order and on the data holding exactly this many
# distinct codes.
gallup_labels <- c(
  "Econ", "Civ Rights", "Health", "Ag", "Labor", "Edu", "Env", "Energy",
  "Immigr", "Trans", "Crime", "SW", "Housing", "Dom Comm", "Def", "Tech",
  "Frgn Trade", "Int Aff", "Govt Ops", "Publ Lands", "Other"
)
stopifnot(nrow(gallup_topics) == length(gallup_labels))
gallup_topics$Topics <- gallup_labels

ggplot(gallup_topics, aes(x = Topics, y = percent_freq)) +
  geom_col() +
  labs(
    title = "Topic Distribution of Gallup Most Important Problem Responses",
    x = "Topics",
    y = "Frequency (%)",
    caption = "(Other topic is unique to Gallup and Culture topic was not posed as a question)"
  )
write_xlsx(gallup_topics, "gallup_topics.xlsx")
```
## Mayhew
```{r}
# Load the hand-coded Mayhew file; no year filter is applied to it.
mayhew_codes <- read.csv("Mayhew major topic codes.csv")

# One row per major-topic code (Var1) with its number of rows (Freq).
mayhew_topics <- as.data.frame(table(mayhew_codes$major.topic))

# Percentage share per topic. Dividing by the observed total instead of a
# typed-in row count keeps the shares summing to 100 if the data changes.
mayhew_topics$percent_freq <-
  100 * mayhew_topics$Freq / sum(mayhew_topics$Freq)

# Short display labels, assigned by position: this relies on table() listing
# the codes in ascending order and on the data holding exactly this many
# distinct codes.
mayhew_labels <- c(
  "Econ", "Civ Rights", "Health", "Ag", "Labor", "Edu", "Env", "Energy",
  "Immigr", "Trans", "Crime", "SW", "Housing", "Dom Comm", "Def", "Tech",
  "Frgn Trade", "Int Aff", "Govt Ops", "Publ Lands"
)
stopifnot(nrow(mayhew_topics) == length(mayhew_labels))
mayhew_topics$Topics <- mayhew_labels

# table() stores the codes as factor levels. Convert them to real numbers so
# the exported Var1 column holds topic codes (not factor positions) and the
# placeholder row below can carry code 23.
mayhew_topics$Var1 <- as.numeric(as.character(mayhew_topics$Var1))

# Zero-count placeholder so the Culture topic still shows up on the x axis.
mayhew_topics <- rbind(
  mayhew_topics,
  data.frame(Var1 = 23, Freq = 0, percent_freq = 0, Topics = "Culture")
)

# The title names the Mayhew data so this figure cannot be mistaken for the
# public laws plot.
ggplot(mayhew_topics, aes(x = Topics, y = percent_freq)) +
  geom_col() +
  labs(
    title = "Topic Distribution of Mayhew Important Enactments",
    x = "Topics",
    y = "Frequency (%)"
  )
write_xlsx(mayhew_topics, "mayhew_topics.xlsx")

# Sanity check: should print 100.
sum(mayhew_topics$percent_freq)
```
## CQ
```{r}
# Attach the bill-level columns to the CQ coding sheet by bill_id and export
# the merged table. `all_bills` comes from the "All bills" chunk, so that
# chunk has to run first.
CQ_coding <- read.csv("CQ coding sheet.csv")
CQ_coding_new <- merge(CQ_coding, all_bills, by = "bill_id")
write_xlsx(CQ_coding_new, "CQ major topic codes.xlsx")

# The topic codes are read from a separate CSV, not from the merge result.
# NOTE(review): presumably this file is prepared from the export above -
# confirm.
CQ <- read.csv("CQ topic codes.csv")

# One row per major-topic code (Var1) with its number of rows (Freq).
CQ_topics <- as.data.frame(table(CQ$major.topic))

# Percentage share per topic. Dividing by the observed total instead of a
# typed-in row count keeps the shares summing to 100 if the data changes.
CQ_topics$percent_freq <- 100 * CQ_topics$Freq / sum(CQ_topics$Freq)

# Sanity check: should print 100.
sum(CQ_topics$percent_freq)

# Short display labels, assigned by position: this relies on table() listing
# the codes in ascending order and on the data holding exactly this many
# distinct codes.
CQ_labels <- c(
  "Econ", "Civ Rights", "Health", "Ag", "Labor", "Edu", "Env", "Energy",
  "Immigr", "Trans", "Crime", "SW", "Housing", "Dom Comm", "Def", "Tech",
  "Frgn Trade", "Int Aff", "Govt Ops", "Publ Lands"
)
stopifnot(nrow(CQ_topics) == length(CQ_labels))
CQ_topics$Topics <- CQ_labels

# table() stores the codes as factor levels. Convert them to real numbers so
# the exported Var1 column holds topic codes (not factor positions) and the
# placeholder row below can carry code 23.
CQ_topics$Var1 <- as.numeric(as.character(CQ_topics$Var1))

# Zero-count placeholder so the Culture topic still shows up on the x axis.
CQ_topics <- rbind(
  CQ_topics,
  data.frame(Var1 = 23, Freq = 0, percent_freq = 0, Topics = "Culture")
)

ggplot(CQ_topics, aes(x = Topics, y = percent_freq)) +
  geom_col() +
  labs(
    title = "Topic Distribution of CQ Bills",
    x = "Topics",
    y = "Frequency (%)"
  )
write_xlsx(CQ_topics, "CQ_topics.xlsx")
```

## CES vs roll call vs public laws 
```{r}
# Tag every row of each summary table with its source so the bars can be
# coloured by dataset. A single string is recycled to all rows, so the tag
# works whatever the number of topics in each table.
# NOTE(review): CES_questions_topics is not created in any chunk shown in
# this document - it must already exist in the session.
CES_questions_topics$id <- "CES"
public_laws_topics$id <- "laws"
roll_call_topics$id <- "roll"

# Stack the three tables (they share the same columns) and draw one dodged
# bar per source within each topic.
CES_roll_call_public_laws <- rbind(
  CES_questions_topics,
  public_laws_topics,
  roll_call_topics
)
ggplot(CES_roll_call_public_laws, aes(x = Topics, y = percent_freq, fill = id)) +
  geom_col(position = position_dodge()) +
  labs(
    title = "CES questions vs Roll Call Votes vs Public Laws",
    x = "Topics",
    y = "Frequency (%)"
  )
```
## CES vs all bills
```{r}
# Tag the CES rows with their source. A single string is recycled to all
# rows, so the tag works whatever the number of topics in the table.
# NOTE(review): CES_questions_topics is not created in any chunk shown in
# this document - it must already exist in the session.
CES_questions_topics$id <- "CES"

# Leave out the Priv topic, which exists only in the bills data. The row is
# selected by its label, so the filter does not depend on row order.
all_bills_topics_compare <-
  all_bills_topics[all_bills_topics$Topics != "Priv", ]
all_bills_topics_compare$id <- "bills"

# Stack both tables and draw one dodged bar per source within each topic.
CES_all_bills <- rbind(CES_questions_topics, all_bills_topics_compare)
ggplot(CES_all_bills, aes(x = Topics, y = percent_freq, fill = id)) +
  geom_col(position = position_dodge()) +
  labs(
    title = "CES questions vs All Bills",
    x = "Topics",
    y = "Frequency (%)"
  )
```
## CES vs gallup 
```{r}
# Tag every row of each summary table with its source. A single string is
# recycled to all rows, so the tag works whatever the number of topics in
# each table.
# NOTE(review): CES_questions_topics is not created in any chunk shown in
# this document - it must already exist in the session.
CES_questions_topics$id <- "CES"
gallup_topics$id <- "gallup"

# Leave out the Other topic, which exists only in the Gallup data. The row is
# selected by its label, so the filter does not depend on row order.
gallup_topics_compare <- gallup_topics[gallup_topics$Topics != "Other", ]

# Var1 is still a factor here; turn it into the numeric topic codes it
# represents before stacking it with the CES table.
gallup_topics_compare$Var1 <-
  as.numeric(as.character(gallup_topics_compare$Var1))

# Stack both tables and draw one dodged bar per source within each topic.
CES_gallup <- rbind(CES_questions_topics, gallup_topics_compare)
ggplot(CES_gallup, aes(x = Topics, y = percent_freq, fill = id)) +
  geom_col(position = position_dodge()) +
  labs(
    title = "CES questions vs Gallup",
    x = "Topics",
    y = "Frequency (%)"
  )
```
## CES vs Mayhew vs Public Laws
```{r}
# Tag every row of each summary table with its source so the bars can be
# coloured by dataset. A single string is recycled to all rows, so the tag
# works whatever the number of topics in each table.
# NOTE(review): CES_questions_topics is not created in any chunk shown in
# this document - it must already exist in the session.
CES_questions_topics$id <- "CES"
mayhew_topics$id <- "mayhew"
public_laws_topics$id <- "PL"

# Stack the three tables (they share the same columns) and draw one dodged
# bar per source within each topic.
CES_mayhew_public_laws <- rbind(
  CES_questions_topics,
  mayhew_topics,
  public_laws_topics
)
ggplot(CES_mayhew_public_laws, aes(x = Topics, y = percent_freq, fill = id)) +
  geom_col(position = position_dodge()) +
  labs(
    title = "CES questions vs Mayhew Important Enactments vs Public Laws",
    x = "Topics",
    y = "Frequency (%)"
  )
```
## CES vs CQ vs All Bills 
```{r}
# Tag the CES rows with their source. A single string is recycled to all
# rows, so the tag works whatever the number of topics in the table.
# NOTE(review): CES_questions_topics is not created in any chunk shown in
# this document - it must already exist in the session.
CES_questions_topics$id <- "CES"

# Leave out the Priv topic, which exists only in the bills data. The row is
# selected by its label, so the filter does not depend on row order.
all_bills_topics_compare <-
  all_bills_topics[all_bills_topics$Topics != "Priv", ]
all_bills_topics_compare$id <- "bills"
CQ_topics$id <- "CQ"

# Stack the three tables (they share the same columns) and draw one dodged
# bar per source within each topic.
CES__CQ_all_bills <- rbind(
  CES_questions_topics,
  CQ_topics,
  all_bills_topics_compare
)
ggplot(CES__CQ_all_bills, aes(x = Topics, y = percent_freq, fill = id)) +
  geom_col(position = position_dodge()) +
  labs(
    title = "CES questions vs CQ vs All Bills",
    x = "Topics",
    y = "Frequency (%)"
  )
```